/*
 * Copyright (c) 2007, 2015, Oracle and/or its affiliates. All rights reserved.
 * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */
/*
 * Copyright 1999-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.regexp.internal;

import com.sun.org.apache.regexp.internal.RE;
import java.util.Hashtable;

/**
 * A regular expression compiler class.  This class compiles a pattern string into a
 * regular expression program interpretable by the RE evaluator class.  The 'recompile'
 * command line tool uses this compiler to pre-compile regular expressions for use
 * with RE.  For a description of the syntax accepted by RECompiler and what you can
 * do with regular expressions, see the documentation for the RE matcher class.
 *
 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
 * @author <a href="mailto:gholam@xtra.co.nz">Michael McCallum</a>
 * @see RE
 * @see recompile
 */
public class RECompiler {

  // The compiled program
  char[] instruction;                                 // The compiled RE 'program' instruction buffer
  int lenInstruction;                                 // The amount of the program buffer currently in use

  // Input state for compiling regular expression
  String pattern;                                     // Input string
  int len;                                            // Length of the pattern string
  int idx;                                            // Current input index into ac
  int parens;                                         // Total number of paren pairs

  // Node flags
  static final int NODE_NORMAL = 0;                 // No flags (nothing special)
  static final int NODE_NULLABLE = 1;                 // True if node is potentially null
  static final int NODE_TOPLEVEL = 2;                 // True if top level expr

  // Special types of 'escapes'
  static final int ESC_MASK = 0xffff0;           // Escape complexity mask
  static final int ESC_BACKREF = 0xfffff;           // Escape is really a backreference
  static final int ESC_COMPLEX = 0xffffe;           // Escape isn't really a true character
  static final int ESC_CLASS = 0xffffd;           // Escape represents a whole class of characters

  // {m,n} stacks
  int maxBrackets = 10;                               // Maximum number of bracket pairs
  static final int bracketUnbounded = -1;             // Unbounded value
  int brackets = 0;                                   // Number of bracket sets
  int[] bracketStart = null;                          // Starting point
  int[] bracketEnd = null;                            // Ending point
  int[] bracketMin = null;                            // Minimum number of matches
  int[] bracketOpt = null;                            // Additional optional matches

  // Lookup table for POSIX character class names
  static Hashtable hashPOSIX = new Hashtable();

  static {
    hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
    hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
    hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
    hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
    hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
    hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
    hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
    hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
    hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
    hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
    hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
    hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
    hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
    hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
  }

  /**
   * Constructor.  Creates (initially empty) storage for a regular expression program.
   */
  public RECompiler() {
    // Start off with a generous, yet reasonable, initial size
    instruction = new char[128];
    lenInstruction = 0;
  }

  /**
   * Ensures that n more characters can fit in the program buffer.
   * If n more can't fit, then the size is doubled until it can.
   *
   * @param n Number of additional characters to ensure will fit.
   */
  void ensure(int n) {
    // Get current program length
    int curlen = instruction.length;

    // If the current length + n more is too much
    if (lenInstruction + n >= curlen) {
      // Double the size of the program array until n more will fit
      while (lenInstruction + n >= curlen) {
        curlen *= 2;
      }

      // Allocate new program array and move data into it
      char[] newInstruction = new char[curlen];
      System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
      instruction = newInstruction;
    }
  }

  /**
   * Emit a single character into the program stream.
   *
   * @param c Character to add
   */
  void emit(char c) {
    // Make room for character
    ensure(1);

    // Add character
    instruction[lenInstruction++] = c;
  }

  /**
   * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
   * pointer is initialized to 0.
   *
   * @param opcode Opcode for new node
   * @param opdata Opdata for new node (only the low 16 bits are currently used)
   * @param insertAt Index at which to insert the new node in the program
   */
  void nodeInsert(char opcode, int opdata, int insertAt) {
    // Make room for a new node
    ensure(RE.nodeSize);

    // Move everything from insertAt to the end down nodeSize elements
    System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize,
        lenInstruction - insertAt);
    instruction[insertAt + RE.offsetOpcode] = opcode;
    instruction[insertAt + RE.offsetOpdata] = (char) opdata;
    instruction[insertAt + RE.offsetNext] = 0;
    lenInstruction += RE.nodeSize;
  }

  /**
   * Appends a node to the end of a node chain
   *
   * @param node Start of node chain to traverse
   * @param pointTo Node to have the tail of the chain point to
   */
  void setNextOfEnd(int node, int pointTo) {
    // Traverse the chain until the next offset is 0
    int next = instruction[node + RE.offsetNext];
    // while the 'node' is not the last in the chain
    // and the 'node' is not the last in the program.
    while (next != 0 && node < lenInstruction) {
      // if the node we are supposed to point to is in the chain then
      // point to the end of the program instead.
      // Michael McCallum <gholam@xtra.co.nz>
      // FIXME: // This is a _hack_ to stop infinite programs.
      // I believe that the implementation of the reluctant matches is wrong but
      // have not worked out a better way yet.
      if (node == pointTo) {
        pointTo = lenInstruction;
      }
      node += next;
      next = instruction[node + RE.offsetNext];
    }
    // if we have reached the end of the program then dont set the pointTo.
    // im not sure if this will break any thing but passes all the tests.
    if (node < lenInstruction) {
      // Point the last node in the chain to pointTo.
      instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
    }
  }

  /**
   * Adds a new node
   *
   * @param opcode Opcode for node
   * @param opdata Opdata for node (only the low 16 bits are currently used)
   * @return Index of new node in program
   */
  int node(char opcode, int opdata) {
    // Make room for a new node
    ensure(RE.nodeSize);

    // Add new node at end
    instruction[lenInstruction + RE.offsetOpcode] = opcode;
    instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
    instruction[lenInstruction + RE.offsetNext] = 0;
    lenInstruction += RE.nodeSize;

    // Return index of new node
    return lenInstruction - RE.nodeSize;
  }


  /**
   * Throws a new internal error exception
   *
   * @throws Error Thrown in the event of an internal error.
   */
  void internalError() throws Error {
    throw new Error("Internal error!");
  }

  /**
   * Throws a new syntax error exception
   *
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  void syntaxError(String s) throws RESyntaxException {
    throw new RESyntaxException(s);
  }

  /**
   * Allocate storage for brackets only as needed
   */
  void allocBrackets() {
    // Allocate bracket stacks if not already done
    if (bracketStart == null) {
      // Allocate storage
      bracketStart = new int[maxBrackets];
      bracketEnd = new int[maxBrackets];
      bracketMin = new int[maxBrackets];
      bracketOpt = new int[maxBrackets];

      // Initialize to invalid values
      for (int i = 0; i < maxBrackets; i++) {
        bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
      }
    }
  }

  /**
   * Enlarge storage for brackets only as needed.
   */
  synchronized void reallocBrackets() {
    // trick the tricky
    if (bracketStart == null) {
      allocBrackets();
    }

    int new_size = maxBrackets * 2;
    int[] new_bS = new int[new_size];
    int[] new_bE = new int[new_size];
    int[] new_bM = new int[new_size];
    int[] new_bO = new int[new_size];
    // Initialize to invalid values
    for (int i = brackets; i < new_size; i++) {
      new_bS[i] = new_bE[i] = new_bM[i] = new_bO[i] = -1;
    }
    System.arraycopy(bracketStart, 0, new_bS, 0, brackets);
    System.arraycopy(bracketEnd, 0, new_bE, 0, brackets);
    System.arraycopy(bracketMin, 0, new_bM, 0, brackets);
    System.arraycopy(bracketOpt, 0, new_bO, 0, brackets);
    bracketStart = new_bS;
    bracketEnd = new_bE;
    bracketMin = new_bM;
    bracketOpt = new_bO;
    maxBrackets = new_size;
  }

  /**
   * Match bracket {m,n} expression put results in bracket member variables
   *
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  void bracket() throws RESyntaxException {
    // Current character must be a '{'
    if (idx >= len || pattern.charAt(idx++) != '{') {
      internalError();
    }

    // Next char must be a digit
    if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
      syntaxError("Expected digit");
    }

    // Get min ('m' of {m,n}) number
    StringBuffer number = new StringBuffer();
    while (idx < len && Character.isDigit(pattern.charAt(idx))) {
      number.append(pattern.charAt(idx++));
    }
    try {
      bracketMin[brackets] = Integer.parseInt(number.toString());
    } catch (NumberFormatException e) {
      syntaxError("Expected valid number");
    }

    // If out of input, fail
    if (idx >= len) {
      syntaxError("Expected comma or right bracket");
    }

    // If end of expr, optional limit is 0
    if (pattern.charAt(idx) == '}') {
      idx++;
      bracketOpt[brackets] = 0;
      return;
    }

    // Must have at least {m,} and maybe {m,n}.
    if (idx >= len || pattern.charAt(idx++) != ',') {
      syntaxError("Expected comma");
    }

    // If out of input, fail
    if (idx >= len) {
      syntaxError("Expected comma or right bracket");
    }

    // If {m,} max is unlimited
    if (pattern.charAt(idx) == '}') {
      idx++;
      bracketOpt[brackets] = bracketUnbounded;
      return;
    }

    // Next char must be a digit
    if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
      syntaxError("Expected digit");
    }

    // Get max number
    number.setLength(0);
    while (idx < len && Character.isDigit(pattern.charAt(idx))) {
      number.append(pattern.charAt(idx++));
    }
    try {
      bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
    } catch (NumberFormatException e) {
      syntaxError("Expected valid number");
    }

    // Optional repetitions must be >= 0
    if (bracketOpt[brackets] < 0) {
      syntaxError("Bad range");
    }

    // Must have close brace
    if (idx >= len || pattern.charAt(idx++) != '}') {
      syntaxError("Missing close brace");
    }
  }

  /**
   * Match an escape sequence.  Handles quoted chars and octal escapes as well
   * as normal escape characters.  Always advances the input stream by the
   * right amount. This code "understands" the subtle difference between an
   * octal escape and a backref.  You can access the type of ESC_CLASS or
   * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
   *
   * @return ESC_* code or character if simple escape
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int escape() throws RESyntaxException {
    // "Shouldn't" happen
    if (pattern.charAt(idx) != '\\') {
      internalError();
    }

    // Escape shouldn't occur as last character in string!
    if (idx + 1 == len) {
      syntaxError("Escape terminates string");
    }

    // Switch on character after backslash
    idx += 2;
    char escapeChar = pattern.charAt(idx - 1);
    switch (escapeChar) {
      case RE.E_BOUND:
      case RE.E_NBOUND:
        return ESC_COMPLEX;

      case RE.E_ALNUM:
      case RE.E_NALNUM:
      case RE.E_SPACE:
      case RE.E_NSPACE:
      case RE.E_DIGIT:
      case RE.E_NDIGIT:
        return ESC_CLASS;

      case 'u':
      case 'x': {
        // Exact required hex digits for escape type
        int hexDigits = (escapeChar == 'u' ? 4 : 2);

        // Parse up to hexDigits characters from input
        int val = 0;
        for (; idx < len && hexDigits-- > 0; idx++) {
          // Get char
          char c = pattern.charAt(idx);

          // If it's a hexadecimal digit (0-9)
          if (c >= '0' && c <= '9') {
            // Compute new value
            val = (val << 4) + c - '0';
          } else {
            // If it's a hexadecimal letter (a-f)
            c = Character.toLowerCase(c);
            if (c >= 'a' && c <= 'f') {
              // Compute new value
              val = (val << 4) + (c - 'a') + 10;
            } else {
              // If it's not a valid digit or hex letter, the escape must be invalid
              // because hexDigits of input have not been absorbed yet.
              syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
            }
          }
        }
        return val;
      }

      case 't':
        return '\t';

      case 'n':
        return '\n';

      case 'r':
        return '\r';

      case 'f':
        return '\f';

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':

        // An octal escape starts with a 0 or has two digits in a row
        if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0') {
          // Handle \nnn octal escapes
          int val = escapeChar - '0';
          if (idx < len && Character.isDigit(pattern.charAt(idx))) {
            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
            if (idx < len && Character.isDigit(pattern.charAt(idx))) {
              val = ((val << 3) + (pattern.charAt(idx++) - '0'));
            }
          }
          return val;
        }

        // It's actually a backreference (\[1-9]), not an escape
        return ESC_BACKREF;

      default:

        // Simple quoting of a character
        return escapeChar;
    }
  }

  /**
   * Compile a character class
   *
   * @return Index of class node
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int characterClass() throws RESyntaxException {
    // Check for bad calling or empty class
    if (pattern.charAt(idx) != '[') {
      internalError();
    }

    // Check for unterminated or empty class
    if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
      syntaxError("Empty or unterminated class");
    }

    // Check for POSIX character class
    if (idx < len && pattern.charAt(idx) == ':') {
      // Skip colon
      idx++;

      // POSIX character classes are denoted with lowercase ASCII strings
      int idxStart = idx;
      while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') {
        idx++;
      }

      // Should be a ":]" to terminate the POSIX character class
      if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']') {
        // Get character class
        String charClass = pattern.substring(idxStart, idx);

        // Select the POSIX class id
        Character i = (Character) hashPOSIX.get(charClass);
        if (i != null) {
          // Move past colon and right bracket
          idx += 2;

          // Return new POSIX character class node
          return node(RE.OP_POSIXCLASS, i.charValue());
        }
        syntaxError("Invalid POSIX character class '" + charClass + "'");
      }
      syntaxError("Invalid POSIX character class syntax");
    }

    // Try to build a class.  Create OP_ANYOF node
    int ret = node(RE.OP_ANYOF, 0);

    // Parse class declaration
    char CHAR_INVALID = Character.MAX_VALUE;
    char last = CHAR_INVALID;
    char simpleChar = 0;
    boolean include = true;
    boolean definingRange = false;
    int idxFirst = idx;
    char rangeStart = Character.MIN_VALUE;
    char rangeEnd;
    RERange range = new RERange();
    while (idx < len && pattern.charAt(idx) != ']') {

      switchOnCharacter:

      // Switch on character
      switch (pattern.charAt(idx)) {
        case '^':
          include = !include;
          if (idx == idxFirst) {
            range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
          }
          idx++;
          continue;

        case '\\': {
          // Escape always advances the stream
          int c;
          switch (c = escape()) {
            case ESC_COMPLEX:
            case ESC_BACKREF:

              // Word boundaries and backrefs not allowed in a character class!
              syntaxError("Bad character class");

            case ESC_CLASS:

              // Classes can't be an endpoint of a range
              if (definingRange) {
                syntaxError("Bad character class");
              }

              // Handle specific type of class (some are ok)
              switch (pattern.charAt(idx - 1)) {
                case RE.E_NSPACE:
                case RE.E_NDIGIT:
                case RE.E_NALNUM:
                  syntaxError("Bad character class");

                case RE.E_SPACE:
                  range.include('\t', include);
                  range.include('\r', include);
                  range.include('\f', include);
                  range.include('\n', include);
                  range.include('\b', include);
                  range.include(' ', include);
                  break;

                case RE.E_ALNUM:
                  range.include('a', 'z', include);
                  range.include('A', 'Z', include);
                  range.include('_', include);

                  // Fall through!

                case RE.E_DIGIT:
                  range.include('0', '9', include);
                  break;
              }

              // Make last char invalid (can't be a range start)
              last = CHAR_INVALID;
              break;

            default:

              // Escape is simple so treat as a simple char
              simpleChar = (char) c;
              break switchOnCharacter;
          }
        }
        continue;

        case '-':

          // Start a range if one isn't already started
          if (definingRange) {
            syntaxError("Bad class range");
          }
          definingRange = true;

          // If no last character, start of range is 0
          rangeStart = (last == CHAR_INVALID ? 0 : last);

          // Premature end of range. define up to Character.MAX_VALUE
          if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
            simpleChar = Character.MAX_VALUE;
            break;
          }
          continue;

        default:
          simpleChar = pattern.charAt(idx++);
          break;
      }

      // Handle simple character simpleChar
      if (definingRange) {
        // if we are defining a range make it now
        rangeEnd = simpleChar;

        // Actually create a range if the range is ok
        if (rangeStart >= rangeEnd) {
          syntaxError("Bad character class");
        }
        range.include(rangeStart, rangeEnd, include);

        // We are done defining the range
        last = CHAR_INVALID;
        definingRange = false;
      } else {
        // If simple character and not start of range, include it
        if (idx >= len || pattern.charAt(idx) != '-') {
          range.include(simpleChar, include);
        }
        last = simpleChar;
      }
    }

    // Shouldn't be out of input
    if (idx == len) {
      syntaxError("Unterminated character class");
    }

    // Absorb the ']' end of class marker
    idx++;

    // Emit character class definition
    instruction[ret + RE.offsetOpdata] = (char) range.num;
    for (int i = 0; i < range.num; i++) {
      emit((char) range.minRange[i]);
      emit((char) range.maxRange[i]);
    }
    return ret;
  }

  /**
   * Absorb an atomic character string.  This method is a little tricky because
   * it can un-include the last character of string if a closure operator follows.
   * This is correct because *+? have higher precedence than concatentation (thus
   * ABC* means AB(C*) and NOT (ABC)*).
   *
   * @return Index of new atom node
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int atom() throws RESyntaxException {
    // Create a string node
    int ret = node(RE.OP_ATOM, 0);

    // Length of atom
    int lenAtom = 0;

    // Loop while we've got input

    atomLoop:

    while (idx < len) {
      // Is there a next char?
      if ((idx + 1) < len) {
        char c = pattern.charAt(idx + 1);

        // If the next 'char' is an escape, look past the whole escape
        if (pattern.charAt(idx) == '\\') {
          int idxEscape = idx;
          escape();
          if (idx < len) {
            c = pattern.charAt(idx);
          }
          idx = idxEscape;
        }

        // Switch on next char
        switch (c) {
          case '{':
          case '?':
          case '*':
          case '+':

            // If the next character is a closure operator and our atom is non-empty, the
            // current character should bind to the closure operator rather than the atom
            if (lenAtom != 0) {
              break atomLoop;
            }
        }
      }

      // Switch on current char
      switch (pattern.charAt(idx)) {
        case ']':
        case '^':
        case '$':
        case '.':
        case '[':
        case '(':
        case ')':
        case '|':
          break atomLoop;

        case '{':
        case '?':
        case '*':
        case '+':

          // We should have an atom by now
          if (lenAtom == 0) {
            // No atom before closure
            syntaxError("Missing operand to closure");
          }
          break atomLoop;

        case '\\':

        {
          // Get the escaped character (advances input automatically)
          int idxBeforeEscape = idx;
          int c = escape();

          // Check if it's a simple escape (as opposed to, say, a backreference)
          if ((c & ESC_MASK) == ESC_MASK) {
            // Not a simple escape, so backup to where we were before the escape.
            idx = idxBeforeEscape;
            break atomLoop;
          }

          // Add escaped char to atom
          emit((char) c);
          lenAtom++;
        }
        break;

        default:

          // Add normal character to atom
          emit(pattern.charAt(idx++));
          lenAtom++;
          break;
      }
    }

    // This "shouldn't" happen
    if (lenAtom == 0) {
      internalError();
    }

    // Emit the atom length into the program
    instruction[ret + RE.offsetOpdata] = (char) lenAtom;
    return ret;
  }

  /**
   * Match a terminal node.
   *
   * @param flags Flags
   * @return Index of terminal node (closeable)
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int terminal(int[] flags) throws RESyntaxException {
    switch (pattern.charAt(idx)) {
      case RE.OP_EOL:
      case RE.OP_BOL:
      case RE.OP_ANY:
        return node(pattern.charAt(idx++), 0);

      case '[':
        return characterClass();

      case '(':
        return expr(flags);

      case ')':
        syntaxError("Unexpected close paren");

      case '|':
        internalError();

      case ']':
        syntaxError("Mismatched class");

      case 0:
        syntaxError("Unexpected end of input");

      case '?':
      case '+':
      case '{':
      case '*':
        syntaxError("Missing operand to closure");

      case '\\': {
        // Don't forget, escape() advances the input stream!
        int idxBeforeEscape = idx;

        // Switch on escaped character
        switch (escape()) {
          case ESC_CLASS:
          case ESC_COMPLEX:
            flags[0] &= ~NODE_NULLABLE;
            return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));

          case ESC_BACKREF: {
            char backreference = (char) (pattern.charAt(idx - 1) - '0');
            if (parens <= backreference) {
              syntaxError("Bad backreference");
            }
            flags[0] |= NODE_NULLABLE;
            return node(RE.OP_BACKREF, backreference);
          }

          default:

            // We had a simple escape and we want to have it end up in
            // an atom, so we back up and fall though to the default handling
            idx = idxBeforeEscape;
            flags[0] &= ~NODE_NULLABLE;
            break;
        }
      }
    }

    // Everything above either fails or returns.
    // If it wasn't one of the above, it must be the start of an atom.
    flags[0] &= ~NODE_NULLABLE;
    return atom();
  }

  /**
   * Compile a possibly closured terminal
   *
   * @param flags Flags passed by reference
   * @return Index of closured node
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int closure(int[] flags) throws RESyntaxException {
    // Before terminal
    int idxBeforeTerminal = idx;

    // Values to pass by reference to terminal()
    int[] terminalFlags = {NODE_NORMAL};

    // Get terminal symbol
    int ret = terminal(terminalFlags);

    // Or in flags from terminal symbol
    flags[0] |= terminalFlags[0];

    // Advance input, set NODE_NULLABLE flag and do sanity checks
    if (idx >= len) {
      return ret;
    }
    boolean greedy = true;
    char closureType = pattern.charAt(idx);
    switch (closureType) {
      case '?':
      case '*':

        // The current node can be null
        flags[0] |= NODE_NULLABLE;

      case '+':

        // Eat closure character
        idx++;

      case '{':

        // Don't allow blantant stupidity
        int opcode = instruction[ret + RE.offsetOpcode];
        if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
          syntaxError("Bad closure operand");
        }
        if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
          syntaxError("Closure operand can't be nullable");
        }
        break;
    }

    // If the next character is a '?', make the closure non-greedy (reluctant)
    if (idx < len && pattern.charAt(idx) == '?') {
      idx++;
      greedy = false;
    }

    if (greedy) {
      // Actually do the closure now
      switch (closureType) {
        case '{': {
          // We look for our bracket in the list
          boolean found = false;
          int i;
          allocBrackets();
          for (i = 0; i < brackets; i++) {
            if (bracketStart[i] == idx) {
              found = true;
              break;
            }
          }

          // If its not in the list we parse the {m,n}
          if (!found) {
            if (brackets >= maxBrackets) {
              reallocBrackets();
            }
            bracketStart[brackets] = idx;
            bracket();
            bracketEnd[brackets] = idx;
            i = brackets++;
          }

          // Process min first
          if (bracketMin[i]-- > 0) {
            if (bracketMin[i] > 0 || bracketOpt[i] != 0) {
              // Rewind stream and run it through again - more matchers coming
              for (int j = 0; j < brackets; j++) {
                if (j != i && bracketStart[j] < idx
                    && bracketStart[j] >= idxBeforeTerminal) {
                  brackets--;
                  bracketStart[j] = bracketStart[brackets];
                  bracketEnd[j] = bracketEnd[brackets];
                  bracketMin[j] = bracketMin[brackets];
                  bracketOpt[j] = bracketOpt[brackets];
                }
              }

              idx = idxBeforeTerminal;
            } else {
              // Bug #1030: No optinal matches - no need to rewind
              idx = bracketEnd[i];
            }
            break;
          }

          // Do the right thing for maximum ({m,})
          if (bracketOpt[i] == bracketUnbounded) {
            // Drop through now and closure expression.
            // We are done with the {m,} expr, so skip rest
            closureType = '*';
            bracketOpt[i] = 0;
            idx = bracketEnd[i];
          } else if (bracketOpt[i]-- > 0) {
            if (bracketOpt[i] > 0) {
              // More optional matchers - 'play it again sam!'
              idx = idxBeforeTerminal;
            } else {
              // Bug #1030: We are done - this one is last and optional
              idx = bracketEnd[i];
            }
            // Drop through to optionally close
            closureType = '?';
          } else {
            // Rollback terminal - neither min nor opt matchers present
            lenInstruction = ret;
            node(RE.OP_NOTHING, 0);

            // We are done. skip the rest of {m,n} expr
            idx = bracketEnd[i];
            break;
          }
        }

        // Fall through!

        case '?':
        case '*':

          if (!greedy) {
            break;
          }

          if (closureType == '?') {
            // X? is compiled as (X|)
            nodeInsert(RE.OP_BRANCH, 0, ret);                 // branch before X
            setNextOfEnd(ret, node(RE.OP_BRANCH, 0));        // inserted branch to option
            int nothing = node(RE.OP_NOTHING, 0);            // which is OP_NOTHING
            setNextOfEnd(ret, nothing);                       // point (second) branch to OP_NOTHING
            setNextOfEnd(ret + RE.nodeSize,
                nothing);         // point the end of X to OP_NOTHING node
          }

          if (closureType == '*') {
            // X* is compiled as (X{gotoX}|)
            nodeInsert(RE.OP_BRANCH, 0, ret);                         // branch before X
            setNextOfEnd(ret + RE.nodeSize,
                node(RE.OP_BRANCH, 0));   // end of X points to an option
            setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0));     // to goto
            setNextOfEnd(ret + RE.nodeSize, ret);                     // the start again
            setNextOfEnd(ret, node(RE.OP_BRANCH, 0));                 // the other option is
            setNextOfEnd(ret, node(RE.OP_NOTHING, 0));                // OP_NOTHING
          }
          break;

        case '+': {
          // X+ is compiled as X({gotoX}|)
          int branch;
          branch = node(RE.OP_BRANCH, 0);                   // a new branch
          setNextOfEnd(ret, branch);                        // is added to the end of X
          setNextOfEnd(node(RE.OP_GOTO, 0), ret);           // one option is to go back to the start
          setNextOfEnd(branch, node(RE.OP_BRANCH, 0));      // the other option
          setNextOfEnd(ret, node(RE.OP_NOTHING, 0));        // is OP_NOTHING
        }
        break;
      }
    } else {
      // Add end after closured subexpr
      setNextOfEnd(ret, node(RE.OP_END, 0));

      // Actually do the closure now
      switch (closureType) {
        case '?':
          nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
          break;

        case '*':
          nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
          break;

        case '+':
          nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
          break;
      }

      // Point to the expr after the closure
      setNextOfEnd(ret, lenInstruction);
    }
    return ret;
  }

  /**
   * Compile one branch of an or operator (implements concatenation)
   *
   * @param flags Flags passed by reference
   * @return Pointer to branch node
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int branch(int[] flags) throws RESyntaxException {
    // Get each possibly closured piece and concat
    int node;
    int ret = node(RE.OP_BRANCH, 0);
    int chain = -1;
    int[] closureFlags = new int[1];
    boolean nullable = true;
    while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') {
      // Get new node
      closureFlags[0] = NODE_NORMAL;
      node = closure(closureFlags);
      if (closureFlags[0] == NODE_NORMAL) {
        nullable = false;
      }

      // If there's a chain, append to the end
      if (chain != -1) {
        setNextOfEnd(chain, node);
      }

      // Chain starts at current
      chain = node;
    }

    // If we don't run loop, make a nothing node
    if (chain == -1) {
      node(RE.OP_NOTHING, 0);
    }

    // Set nullable flag for this branch
    if (nullable) {
      flags[0] |= NODE_NULLABLE;
    }
    return ret;
  }

  /**
   * Compile an expression with possible parens around it.  Paren matching
   * is done at this level so we can tie the branch tails together.
   *
   * @param flags Flag value passed by reference
   * @return Node index of expression in instruction array
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   */
  int expr(int[] flags) throws RESyntaxException {
    // Create open paren node unless we were called from the top level (which has no parens)
    int paren = -1;
    int ret = -1;
    int closeParens = parens;
    if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
      // if its a cluster ( rather than a proper subexpression ie with backrefs )
      if (idx + 2 < len && pattern.charAt(idx + 1) == '?' && pattern.charAt(idx + 2) == ':') {
        paren = 2;
        idx += 3;
        ret = node(RE.OP_OPEN_CLUSTER, 0);
      } else {
        paren = 1;
        idx++;
        ret = node(RE.OP_OPEN, parens++);
      }
    }
    flags[0] &= ~NODE_TOPLEVEL;

    // Create a branch node
    int branch = branch(flags);
    if (ret == -1) {
      ret = branch;
    } else {
      setNextOfEnd(ret, branch);
    }

    // Loop through branches
    while (idx < len && pattern.charAt(idx) == '|') {
      idx++;
      branch = branch(flags);
      setNextOfEnd(ret, branch);
    }

    // Create an ending node (either a close paren or an OP_END)
    int end;
    if (paren > 0) {
      if (idx < len && pattern.charAt(idx) == ')') {
        idx++;
      } else {
        syntaxError("Missing close paren");
      }
      if (paren == 1) {
        end = node(RE.OP_CLOSE, closeParens);
      } else {
        end = node(RE.OP_CLOSE_CLUSTER, 0);
      }
    } else {
      end = node(RE.OP_END, 0);
    }

    // Append the ending node to the ret nodelist
    setNextOfEnd(ret, end);

    // Hook the ends of each branch to the end node
    int currentNode = ret;
    int nextNodeOffset = instruction[currentNode + RE.offsetNext];
    // while the next node o
    while (nextNodeOffset != 0 && currentNode < lenInstruction) {
      // If branch, make the end of the branch's operand chain point to the end node.
      if (instruction[currentNode + RE.offsetOpcode] == RE.OP_BRANCH) {
        setNextOfEnd(currentNode + RE.nodeSize, end);
      }
      nextNodeOffset = instruction[currentNode + RE.offsetNext];
      currentNode += nextNodeOffset;
    }

    // Return the node list
    return ret;
  }

  /**
   * Compiles a regular expression pattern into a program runnable by the pattern
   * matcher class 'RE'.
   *
   * @param pattern Regular expression pattern to compile (see RECompiler class for details).
   * @return A compiled regular expression program.
   * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
   * @see RECompiler
   * @see RE
   */
  public REProgram compile(String pattern) throws RESyntaxException {
    // Initialize variables for compilation
    this.pattern = pattern;                         // Save pattern in instance variable
    len = pattern.length();                         // Precompute pattern length for speed
    idx = 0;                                        // Set parsing index to the first character
    lenInstruction = 0;                             // Set emitted instruction count to zero
    parens = 1;                                     // Set paren level to 1 (the implicit outer parens)
    brackets = 0;                                   // No bracketed closures yet

    // Initialize pass by reference flags value
    int[] flags = {NODE_TOPLEVEL};

    // Parse expression
    expr(flags);

    // Should be at end of input
    if (idx != len) {
      if (pattern.charAt(idx) == ')') {
        syntaxError("Unmatched close paren");
      }
      syntaxError("Unexpected input remains");
    }

    // Return the result
    char[] ins = new char[lenInstruction];
    System.arraycopy(instruction, 0, ins, 0, lenInstruction);
    return new REProgram(parens, ins);
  }

  /**
   * Local, nested class for maintaining character ranges for character classes.
   */
  class RERange {

    int size = 16;                      // Capacity of current range arrays
    int[] minRange = new int[size];     // Range minima
    int[] maxRange = new int[size];     // Range maxima
    int num = 0;                        // Number of range array elements in use

    /**
     * Deletes the range at a given index from the range lists
     *
     * @param index Index of range to delete from minRange and maxRange arrays.
     */
    void delete(int index) {
      // Return if no elements left or index is out of range
      if (num == 0 || index >= num) {
        return;
      }

      // Move elements down
      while (++index < num) {
        if (index - 1 >= 0) {
          minRange[index - 1] = minRange[index];
          maxRange[index - 1] = maxRange[index];
        }
      }

      // One less element now
      num--;
    }

    /**
     * Merges a range into the range list, coalescing ranges if possible.
     *
     * @param min Minimum end of range
     * @param max Maximum end of range
     */
    void merge(int min, int max) {
      // Loop through ranges
      for (int i = 0; i < num; i++) {
        // Min-max is subsumed by minRange[i]-maxRange[i]
        if (min >= minRange[i] && max <= maxRange[i]) {
          return;
        }

        // Min-max subsumes minRange[i]-maxRange[i]
        else if (min <= minRange[i] && max >= maxRange[i]) {
          delete(i);
          merge(min, max);
          return;
        }

        // Min is in the range, but max is outside
        else if (min >= minRange[i] && min <= maxRange[i]) {
          delete(i);
          min = minRange[i];
          merge(min, max);
          return;
        }

        // Max is in the range, but min is outside
        else if (max >= minRange[i] && max <= maxRange[i]) {
          delete(i);
          max = maxRange[i];
          merge(min, max);
          return;
        }
      }

      // Must not overlap any other ranges
      if (num >= size) {
        size *= 2;
        int[] newMin = new int[size];
        int[] newMax = new int[size];
        System.arraycopy(minRange, 0, newMin, 0, num);
        System.arraycopy(maxRange, 0, newMax, 0, num);
        minRange = newMin;
        maxRange = newMax;
      }
      minRange[num] = min;
      maxRange[num] = max;
      num++;
    }

    /**
     * Removes a range by deleting or shrinking all other ranges
     *
     * @param min Minimum end of range
     * @param max Maximum end of range
     */
    void remove(int min, int max) {
      // Loop through ranges
      for (int i = 0; i < num; i++) {
        // minRange[i]-maxRange[i] is subsumed by min-max
        if (minRange[i] >= min && maxRange[i] <= max) {
          delete(i);
          i--;
          return;
        }

        // min-max is subsumed by minRange[i]-maxRange[i]
        else if (min >= minRange[i] && max <= maxRange[i]) {
          int minr = minRange[i];
          int maxr = maxRange[i];
          delete(i);
          if (minr < min) {
            merge(minr, min - 1);
          }
          if (max < maxr) {
            merge(max + 1, maxr);
          }
          return;
        }

        // minRange is in the range, but maxRange is outside
        else if (minRange[i] >= min && minRange[i] <= max) {
          minRange[i] = max + 1;
          return;
        }

        // maxRange is in the range, but minRange is outside
        else if (maxRange[i] >= min && maxRange[i] <= max) {
          maxRange[i] = min - 1;
          return;
        }
      }
    }

    /**
     * Includes (or excludes) the range from min to max, inclusive.
     *
     * @param min Minimum end of range
     * @param max Maximum end of range
     * @param include True if range should be included.  False otherwise.
     */
    void include(int min, int max, boolean include) {
      if (include) {
        merge(min, max);
      } else {
        remove(min, max);
      }
    }

    /**
     * Includes a range with the same min and max
     *
     * @param minmax Minimum and maximum end of range (inclusive)
     * @param include True if range should be included.  False otherwise.
     */
    void include(char minmax, boolean include) {
      include(minmax, minmax, include);
    }
  }
}
